## Combine demographics with performance, keeping every performance row
## (all.y) so no loan record is dropped.
train_data <- merge(traindemographics, trainperf, all.y = TRUE, by = "customerid")
test_data <- merge(testdemographics, testperf, all.y = TRUE, by = "customerid")

## Stack train and test, tagging the origin so the sets can be split
## apart again after shared preprocessing.
loan_data <- rbind(train_data[, set := "train"],
                   test_data[, set := "test"],
                   fill = TRUE)

loan_data %>%
  head() %>%
  datatable(options = list(scrollX = TRUE))

## Parse date columns and derive age plus approval-date features.
dates <- c("birthdate", "approveddate", "creationdate")
loan_data[, (dates) := lapply(.SD, as.Date), .SDcols = dates]
## NOTE(review): dividing by 365 ignores leap years; 365.25 would be
## marginally more accurate, kept as-is to preserve downstream values.
loan_data[, age := as.numeric(approveddate - birthdate) / 365]
loan_data[, aprove_month := month(approveddate)]  # (sic) name reused later when dropped
loan_data[, approve_day := wday(approveddate)]
loan_data[, approve_year := year(approveddate)]

## Class balance of the target (only train rows have a non-NA flag).
loan_data[!is.na(good_bad_flag), .N, by = .(good_bad_flag)] %>%
  .[, perc := round(N / sum(N) * 100, 2)] %>%
  ggplot(aes(good_bad_flag, perc, fill = good_bad_flag)) +
  geom_bar(stat = "identity") +
  geom_text(aes(good_bad_flag, perc, label = paste(perc, "%"),
                vjust = .05, hjust = .5),
            size = 4) +
  theme_hc() +
  labs(title = "Percentage of bad loans") +
  scale_fill_colorblind(name = "") +
  theme(legend.position = "none")

## Missingness per variable. `missing_perc` is computed in an earlier
## chunk not visible here — presumably has `variables` and `perc` columns;
## TODO confirm against that chunk.
ggplot(missing_perc, aes(reorder(variables, perc), perc)) +
  geom_bar(stat = "identity") +
  theme_fivethirtyeight() +
  coord_flip()

loan_data[, loannumber := as.numeric(loannumber)]
## Candidate variables to drop: more than half missing.
missing_var_del <- missing_perc[perc > 50, variables]

## KNN imputation
library(VIM)
loan_data[, (dates) := NULL]    # raw dates already converted into features above
loan_data[, referredby := NULL]
loan_data <- kNN(loan_data, useImputedDist = FALSE, k = 10)
setDT(loan_data)                # kNN() returns a data.frame

## VIM::kNN appends one logical "<var>_imp" indicator column per variable;
## recode them to 0/1 integers (as.integer maps FALSE -> 0, TRUE -> 1).
nms_all <- names(loan_data)
nms_imp <- nms_all[grepl("_imp$", nms_all)]
loan_data[, (nms_imp) := lapply(.SD, as.integer), .SDcols = nms_imp]

## Drop indicator columns whose variable never needed imputation
## (column sum of zero means no row was imputed).
col_sum_imp <- loan_data[, colSums(.SD), .SDcols = nms_imp]
col_sum_imp <- names(col_sum_imp[col_sum_imp == 0])
#var_importants <- fread("var_importanta.csv")
loan_data[, (col_sum_imp) := NULL]
loan_data %>%
  head() %>%
  datatable(options = list(scrollX = TRUE))

## Pin the target's factor levels so "Bad" is the first (positive) class.
loan_data[, good_bad_flag := factor(good_bad_flag, levels = c("Bad", "Good"))]

## Drop helper columns that must not enter the model.
## BUG FIX(review): " good_bad_flag_imp" had a stray leading space, so the
## intended imputation-indicator column was never actually removed.
nms_del1 <- c("set_imp", "good_bad_flag_imp",
              "approve_year", "aprove_month",
              "year", "systemloanid")
loan_data[, (nms_del1) := NULL]

## Names of the numeric predictors to rescale later
## (exclude imputation indicators and the target).
class_nms <- sapply(loan_data, class)
nums <- class_nms[class_nms == "numeric"] %>% names()
nums <- nums[!grepl("_imp|good_bad_flag", nums)]
## Min-max scale a numeric vector to [0, 1].
##
## @param x     numeric vector.
## @param na.rm if TRUE, NAs are ignored when computing the range (NA
##              entries remain NA in the output). Default FALSE preserves
##              the original behavior exactly.
## @return numeric vector of the same length as x. A constant input
##         yields NaN (division by a zero range), as in the original.
zero_one <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
## Rescale every numeric predictor to [0, 1] in place.
loan_data[, (nums) := lapply(.SD, zero_one), .SDcols = nums]

## Split back into train/test and drop the bookkeeping column by reference.
train_data <- loan_data[set == "train"][, set := NULL]
test_data <- loan_data[set == "test"][, set := NULL]
## Model Cross validation
## Cross-validation setup and tuning grids ----
library(caret)  # createFolds(), trainControl(), caretModelSpec() below

set.seed(100)
## BUG FIX(review): createFolds() returns the HELD-OUT indices by default,
## but trainControl(index = ...) expects the TRAINING indices of each
## resample. Without returnTrain = TRUE every model trains on ~1/10 of the
## data and is evaluated on the remaining 9/10 — the original log confirms
## this ("Summary of sample sizes: 190, 191, ..." out of 1906 rows).
cv_fold <- createFolds(train_sampled$good_bad_flag, k = 10, returnTrain = TRUE)
train_ctrl <- trainControl(method = "cv",
                           number = 10,
                           summaryFunction = twoClassSummary,  # ROC/Sens/Spec
                           classProbs = TRUE,
                           allowParallel = TRUE,
                           index = cv_fold,   # overrides number/method folds
                           verboseIter = FALSE,
                           savePredictions = TRUE,
                           search = "grid")

## xgboost grid: 2 nrounds x 2 depths x 2 gammas x 3 colsample x 3 subsample.
xgb_grid <- expand.grid(nrounds = c(50, 100),
                        eta = 0.4,
                        max_depth = c(2, 3),
                        gamma = c(0, .01),
                        colsample_bytree = c(0.6, .8, 1),
                        min_child_weight = 1,
                        subsample = c(.5, .8, 1))

## ranger grid: mtry capped at the number of predictors
## (ncol minus customerid and the target).
ranger_grid <- expand.grid(splitrule = c("extratrees", "gini"),
                           mtry = c(10, 20, (ncol(train_data) - 2)),
                           min.node.size = c(1, 5))

svm_grid <- expand.grid(C = c(1, 3, 5, 20),
                        sigma = seq(0.001, 0.524, length.out = 7))
library(caretEnsemble)
library(tictoc)

## Fit the three candidate learners under identical resampling so their
## per-fold results are directly comparable later on.
tune_specs <- list(
  caretModelSpec(method = "xgbTree", tuneGrid = xgb_grid),
  caretModelSpec(method = "svmRadial", tuneGrid = svm_grid),
  caretModelSpec(method = "ranger", tuneGrid = ranger_grid)
)

tic()
model_list <- caretList(
  good_bad_flag ~ .,
  data = train_sampled[, .SD, .SDcols = !"customerid"],
  metric = "ROC",
  trControl = train_ctrl,
  tuneList = tune_specs
)
toc()  ## 184.56 sec elapsed
## $xgbTree
## eXtreme Gradient Boosting
##
## 1906 samples
## 25 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ...
## Resampling results across tuning parameters:
##
## max_depth gamma colsample_bytree subsample nrounds ROC Sens
## 2 0.00 0.6 0.5 50 0.6728847 0.6255205
## 2 0.00 0.6 0.5 100 0.6687239 0.6304200
## 2 0.00 0.6 0.8 50 0.7288039 0.6641100
## 2 0.00 0.6 0.8 100 0.7116424 0.6563050
## 2 0.00 0.6 1.0 50 0.7561605 0.6733212
## 2 0.00 0.6 1.0 100 0.7316727 0.6772908
## 2 0.00 0.8 0.5 50 0.6729151 0.6205124
## 2 0.00 0.8 0.5 100 0.6624098 0.6245945
## 2 0.00 0.8 0.8 50 0.7327469 0.6672589
## 2 0.00 0.8 0.8 100 0.7079180 0.6519899
## 2 0.00 0.8 1.0 50 0.7532971 0.6749470
## 2 0.00 0.8 1.0 100 0.7320716 0.6788064
## 2 0.00 1.0 0.5 50 0.6698014 0.6229604
## 2 0.00 1.0 0.5 100 0.6593348 0.6231912
## 2 0.00 1.0 0.8 50 0.7278392 0.6629464
## 2 0.00 1.0 0.8 100 0.7079542 0.6654007
## 2 0.00 1.0 1.0 50 0.7502369 0.6764688
## 2 0.00 1.0 1.0 100 0.7289987 0.6756579
## 2 0.01 0.6 0.5 50 0.6696325 0.6348530
## 2 0.01 0.6 0.5 100 0.6660304 0.6332225
## 2 0.01 0.6 0.8 50 0.7350222 0.6693601
## 2 0.01 0.6 0.8 100 0.7153165 0.6644654
## 2 0.01 0.6 1.0 50 0.7580501 0.6747213
## 2 0.01 0.6 1.0 100 0.7379402 0.6802074
## 2 0.01 0.8 0.5 50 0.6654309 0.6214350
## 2 0.01 0.8 0.5 100 0.6561968 0.6222613
## 2 0.01 0.8 0.8 50 0.7339612 0.6737868
## 2 0.01 0.8 0.8 100 0.7155544 0.6560712
## 2 0.01 0.8 1.0 50 0.7548866 0.6721542
## 2 0.01 0.8 1.0 100 0.7308095 0.6753082
## 2 0.01 1.0 0.5 50 0.6527683 0.6149096
## 2 0.01 1.0 0.5 100 0.6568427 0.6309993
## 2 0.01 1.0 0.8 50 0.7240758 0.6685423
## 2 0.01 1.0 0.8 100 0.7079965 0.6636481
## 2 0.01 1.0 1.0 50 0.7502187 0.6768185
## 2 0.01 1.0 1.0 100 0.7292334 0.6763572
## 3 0.00 0.6 0.5 50 0.6577138 0.6098955
## 3 0.00 0.6 0.5 100 0.6579090 0.6192249
## 3 0.00 0.6 0.8 50 0.7110202 0.6627174
## 3 0.00 0.6 0.8 100 0.6925403 0.6455771
## 3 0.00 0.6 1.0 50 0.7329705 0.6760058
## 3 0.00 0.6 1.0 100 0.7119700 0.6608549
## 3 0.00 0.8 0.5 50 0.6501829 0.6171174
## 3 0.00 0.8 0.5 100 0.6461245 0.6104793
## 3 0.00 0.8 0.8 50 0.7057669 0.6446497
## 3 0.00 0.8 0.8 100 0.6866213 0.6469789
## 3 0.00 0.8 1.0 50 0.7270526 0.6676116
## 3 0.00 0.8 1.0 100 0.7052232 0.6558381
## 3 0.00 1.0 0.5 50 0.6535350 0.6103603
## 3 0.00 1.0 0.5 100 0.6445891 0.6115307
## 3 0.00 1.0 0.8 50 0.6981641 0.6418469
## 3 0.00 1.0 0.8 100 0.6829766 0.6350843
## 3 0.00 1.0 1.0 50 0.7291631 0.6697104
## 3 0.00 1.0 1.0 100 0.7071115 0.6565366
## 3 0.01 0.6 0.5 50 0.6575633 0.6202778
## 3 0.01 0.6 0.5 100 0.6453809 0.6187602
## 3 0.01 0.6 0.8 50 0.7092909 0.6593326
## 3 0.01 0.6 0.8 100 0.6908117 0.6421964
## 3 0.01 0.6 1.0 50 0.7336854 0.6659801
## 3 0.01 0.6 1.0 100 0.7100205 0.6580553
## 3 0.01 0.8 0.5 50 0.6556257 0.6185275
## 3 0.01 0.8 0.5 100 0.6452680 0.6172464
## 3 0.01 0.8 0.8 50 0.7082274 0.6484968
## 3 0.01 0.8 0.8 100 0.6888123 0.6404494
## 3 0.01 0.8 1.0 50 0.7315376 0.6728584
## 3 0.01 0.8 1.0 100 0.7105410 0.6594552
## 3 0.01 1.0 0.5 50 0.6597036 0.6172513
## 3 0.01 1.0 0.5 100 0.6513857 0.6135165
## 3 0.01 1.0 0.8 50 0.7118478 0.6603852
## 3 0.01 1.0 0.8 100 0.6903015 0.6481455
## 3 0.01 1.0 1.0 50 0.7287420 0.6681953
## 3 0.01 1.0 1.0 100 0.7071734 0.6564200
## Spec
## 0.6156090
## 0.6135073
## 0.6670262
## 0.6426578
## 0.7096938
## 0.6680748
## 0.6271593
## 0.6032520
## 0.6855564
## 0.6490688
## 0.7011851
## 0.6690106
## 0.6214418
## 0.6013809
## 0.6681879
## 0.6383447
## 0.6995523
## 0.6614318
## 0.6115329
## 0.5997547
## 0.6782133
## 0.6530311
## 0.7123787
## 0.6718071
## 0.6200204
## 0.6034756
## 0.6744838
## 0.6578180
## 0.7023487
## 0.6663226
## 0.6084888
## 0.5983467
## 0.6576949
## 0.6441748
## 0.6993192
## 0.6622477
## 0.6097796
## 0.6032469
## 0.6533812
## 0.6336797
## 0.6691205
## 0.6439431
## 0.6018496
## 0.5978856
## 0.6468594
## 0.6228411
## 0.6630656
## 0.6413769
## 0.6091973
## 0.5891455
## 0.6442915
## 0.6222532
## 0.6721569
## 0.6525701
## 0.6027784
## 0.5900682
## 0.6405579
## 0.6302996
## 0.6755362
## 0.6476725
## 0.6101219
## 0.5898378
## 0.6526839
## 0.6329835
## 0.6687769
## 0.6511728
## 0.6166631
## 0.6026713
## 0.6541987
## 0.6307646
## 0.6721568
## 0.6526862
##
## Tuning parameter 'eta' was held constant at a value of 0.4
## Tuning
## parameter 'min_child_weight' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 2, eta
## = 0.4, gamma = 0.01, colsample_bytree = 0.6, min_child_weight = 1
## and subsample = 1.
##
## $svmRadial
## Support Vector Machines with Radial Basis Function Kernel
##
## 1906 samples
## 25 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ...
## Resampling results across tuning parameters:
##
## C sigma ROC Sens Spec
## 1 0.00100000 0.4422421 0.6618623 0.3382454
## 1 0.08816667 0.7086939 0.6597959 0.6335690
## 1 0.17533333 0.7352082 0.6774006 0.6674958
## 1 0.26250000 0.6962665 0.7022576 0.6216990
## 1 0.34966667 0.7400808 0.6733189 0.6819551
## 1 0.43683333 0.7373153 0.6618844 0.6887111
## 1 0.52400000 0.7342219 0.6515168 0.7003736
## 3 0.00100000 0.5142799 0.3957886 0.6104006
## 3 0.08816667 0.7690665 0.6757621 0.7382671
## 3 0.17533333 0.7623650 0.6896374 0.7107533
## 3 0.26250000 0.7537839 0.6913852 0.6899916
## 3 0.34966667 0.7465813 0.6841606 0.6867337
## 3 0.43683333 0.7405400 0.6820668 0.6756531
## 3 0.52400000 0.7350261 0.6539637 0.6959443
## 5 0.00100000 0.5111336 0.4664782 0.5415850
## 5 0.08816667 0.7713958 0.6841584 0.7343014
## 5 0.17533333 0.7573858 0.6927878 0.7027055
## 5 0.26250000 0.7484558 0.6905698 0.6826452
## 5 0.34966667 0.7422423 0.6734292 0.6884862
## 5 0.43683333 0.7366419 0.6821707 0.6678409
## 5 0.52400000 0.7323630 0.6729630 0.6736692
## 20 0.00100000 0.6034719 0.5483381 0.5842606
## 20 0.08816667 0.7524496 0.6948853 0.6916302
## 20 0.17533333 0.7440833 0.6894071 0.6758915
## 20 0.26250000 0.7390002 0.6813623 0.6771745
## 20 0.34966667 0.7343741 0.6814779 0.6679639
## 20 0.43683333 0.7307089 0.6771677 0.6664536
## 20 0.52400000 0.7266876 0.6686620 0.6708850
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.08816667 and C = 5.
##
## $ranger
## Random Forest
##
## 1906 samples
## 25 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ...
## Resampling results across tuning parameters:
##
## splitrule mtry min.node.size ROC Sens Spec
## extratrees 10 1 0.8165927 0.7037461 0.7881626
## extratrees 10 5 0.8168939 0.7000141 0.7932919
## extratrees 20 1 0.8196030 0.7035087 0.7931762
## extratrees 20 5 0.8191942 0.6941828 0.8070530
## extratrees 25 1 0.8193723 0.6998969 0.7991231
## extratrees 25 5 0.8179915 0.6942990 0.8054191
## gini 10 1 0.7892622 0.6964107 0.7264911
## gini 10 5 0.7920299 0.6989754 0.7320894
## gini 20 1 0.7787087 0.6824193 0.7220609
## gini 20 5 0.7778494 0.6806713 0.7221780
## gini 25 1 0.7759767 0.6851025 0.7186807
## gini 25 5 0.7763475 0.6811368 0.7198466
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
## and min.node.size = 1.
##
## attr(,"class")
## [1] "caretList"
## For each fitted model, sweep the probability threshold over the saved
## resample predictions and plot the F1 / sensitivity trade-off.
nms_models <- names(model_list)
resamples_stat_list <- vector("list", length(nms_models))  # preallocate
for (j in seq_along(nms_models)) {
  model1 <- model_list[[j]]
  resample_stata <- thresholder(model1,
                                threshold = seq(0, 1, by = 0.01),
                                final = TRUE,
                                statistics = "all") %>% setDT()
  p <- ggplot(resample_stata, aes(x = prob_threshold, y = F1, col = "F1")) +
    geom_point() +
    geom_point(aes(y = Sensitivity, col = "Sensitivity")) +
    scale_x_continuous(breaks = seq(0, 1, by = .1)) +
    ggtitle(nms_models[j])
  print(p)
  resample_stata[, model := nms_models[j]]  # tag rows with the model name
  resamples_stat_list[[j]] <- resample_stata
}
resamples_combined <- rbindlist(resamples_stat_list, fill = TRUE)
library(plotly)

## Interactive ROC-style curve (Recall vs 1 - Specificity) per model,
## built from the threshold sweep above.
ggplotly(
  ggplot(resamples_combined, aes(x = 1 - Specificity, y = Recall, color = model)) +
    geom_line(size = 1) +
    scale_x_continuous(breaks = seq(0, 1, by = .1)) +
    ggtitle("ROC for models") +
    scale_color_viridis_d()
)

library(iml)
## Wrap each caret model in an iml::Predictor so the interpretation tools
## below (FeatureImp, Shapley) can use a common interface.
X_pred <- train_sampled[, .SD, .SDcols = !c("customerid", "good_bad_flag")] %>%
  as.data.frame()
nms_models <- names(model_list)
iml_models <- vector("list", length(nms_models))  # preallocate
for (i in seq_along(nms_models)) {
  this_model <- model_list[[i]]
  ## Return the probability of the first class — "Bad", given the factor
  ## levels set earlier — for every row of newdata.
  pred <- function(model, newdata) {
    results <- predict(model, newdata = newdata, type = "prob")
    results[[1L]]
  }
  # iml does not handle character columns; X_pred is built from the
  # already-encoded training table.
  iml_models[[i]] <- Predictor$new(model = this_model,
                                   data = X_pred,
                                   predict.function = pred,
                                   y = train_sampled$good_bad_flag)
}
plots <- list()
## Permutation feature importance (classification-error loss) per model,
## with the 5th/95th percentile range drawn around each point estimate.
for (i in seq_along(nms_models)) {
  model_this <- iml_models[[i]]
  impa <- FeatureImp$new(model_this, loss = "ce")
  var_importanta <- impa$results %>% data.table()
  #write.csv(var_importanta, file = "var_importanta.csv", row.names = F)
  setorder(var_importanta, -importance)
  var10a <- var_importanta[1:20]  # top 20 features (name says 10 — sic)
  if (i == 2) write.csv(var10a, file = "svm_var.csv", row.names = FALSE)
  ## FIX(review): geom_linerange() has no `width` parameter — ggplot2 was
  ## ignoring it with a warning, so it is dropped here.
  plots[[i]] <- ggplot(var10a, aes(reorder(feature, importance), importance)) +
    geom_point() +
    ggtitle(nms_models[i]) +
    geom_linerange(aes(ymin = importance.05, ymax = importance.95),
                   position = position_dodge(width = .7)) +
    coord_flip()
}
plots ## [[1]]
##
## [[2]]
##
## [[3]]
## Per-observation Shapley values for the best model (ranger), keeping only
## the contribution toward class "Bad". WARNING: extremely slow — the
## original run took ~5 hours (see toc below); results were cached to
## "shap_values.csv" and re-read in the next chunk.
nms <- names(model_list)
ids <- which(nms == "ranger")
shap_list <- vector("list", nrow(X_pred))  # preallocate one slot per row
model_this <- iml_models[[ids]]
tic()
for (i in seq_len(nrow(X_pred))) {
  shap <- Shapley$new(model_this, x.interest = X_pred[i, ], sample.size = 30)
  shap_import <- shap$results %>% data.table()
  shap_import <- shap_import[class == "Bad"]
  ## Tag each row's Shapley table with its customer id for later joining.
  shap_list[[i]] <- shap_import[, customerid := train_sampled[i, customerid]]
}
toc()  ## 17730.95 sec elapsed
library(ggforce)

## SHAP summary (beeswarm) plot: load the cached Shapley values, keep the
## 20 most influential features by mean |phi|, and order them for display.
shap_values <- fread("shap_values.csv")
shap_values[, phi2 := abs(phi)]

shap_imp <- shap_values[, .(Med = median(phi2),
                            Mean = mean(phi2)),
                        by = feature][order(-Mean)][1:20]

shap_values <- shap_values[feature %in% shap_imp$feature]
shap_values[, feature := factor(feature, levels = rev(shap_imp$feature))]

## Sina plot of per-observation SHAP contributions, colored by phi.var.
ggplot(shap_values, aes(feature, phi, color = phi.var)) +
  geom_sina() +
  geom_hline(yintercept = 0) +
  scale_color_gradient(low = "#2187E3", high = "#F32858",
                       breaks = c(0, 1), labels = c("Low", "High")) +
  theme_bw() +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),  # remove axis line
        legend.position = "bottom") +
  coord_flip()